imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv

# autogluon
from autogluon.tabular import TabularDataset, TabularPredictor

    def throw(df, fraud_rate, random_state=42):
        """Downsample normal rows so fraud makes up ``fraud_rate`` of the result.

        Every row with ``is_fraud == 1`` is kept; rows with ``is_fraud == 0``
        are sampled without replacement.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame containing an ``is_fraud`` column with 0/1 labels.
        fraud_rate : float
            Target share of fraud rows in the output, in the interval (0, 1].
        random_state : int, optional
            Seed forwarded to ``DataFrame.sample``. Defaults to 42, the value
            that used to be hard-coded.

        Returns
        -------
        pandas.DataFrame
            Fraud rows followed by the sampled normal rows; the original index
            labels are kept.

        Raises
        ------
        ValueError
            If ``fraud_rate`` is outside (0, 1]. pandas also raises
            ``ValueError`` when the target rate is lower than the rate already
            present, because that would need a sampling fraction above 1.
        """
        if not 0 < fraud_rate <= 1:
            raise ValueError(f"fraud_rate must be in (0, 1], got {fraud_rate!r}")

        fraud_rows = df[df['is_fraud'] == 1].copy()
        normal_rows = df[df['is_fraud'] == 0].copy()

        # Nothing to downsample; also avoids dividing by zero below.
        if normal_rows.empty:
            return fraud_rows

        # Fraction of normal rows to keep so that
        # n_fraud / (n_fraud + n_kept_normal) == fraud_rate.
        keep_frac = (len(fraud_rows) * (1 - fraud_rate)) / (len(normal_rows) * fraud_rate)
        sampled_normal = normal_rows.sample(frac=keep_frac, random_state=random_state)
        return pd.concat([fraud_rows, sampled_normal])
    
    def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3, random_state=None):
        """Split into train/test sets with a chosen fraud share in the test set.

        Parameters
        ----------
        data_frame : pandas.DataFrame
            Frame containing an ``is_fraud`` column with 0/1 labels.
        test_fraud_rate : float
            Share of the test set that should be fraud rows.
        test_rate : float, optional
            Share of all rows that go to the test set (default 0.3).
        random_state : int, optional
            Seed forwarded to both ``DataFrame.sample`` calls. ``None`` (the
            default) keeps the previous unseeded behaviour.

        Returns
        -------
        tuple of pandas.DataFrame
            ``(train_data, test_data)``. The test set lists the normal rows
            first, then the fraud rows.

        Raises
        ------
        ValueError
            Raised by pandas when more fraud or normal rows are requested than
            are available (sampling is without replacement).
        """
        n = len(data_frame)

        # Separate fraud and normal transactions.
        fraud_data = data_frame[data_frame['is_fraud'] == 1]
        normal_data = data_frame[data_frame['is_fraud'] == 0]

        # Work out how many rows of each class go to the test set.
        test_samples = int(test_fraud_rate * (n * test_rate))
        remaining_test_samples = int(n * test_rate) - test_samples

        # Draw the test rows at random from each class.
        test_fraud_data = fraud_data.sample(
            n=test_samples, replace=False, random_state=random_state
        )
        test_normal_data = normal_data.sample(
            n=remaining_test_samples, replace=False, random_state=random_state
        )

        test_data = pd.concat([test_normal_data, test_fraud_data])

        # Everything not drawn for testing is training data.
        # NOTE(review): membership is decided by index label, so this assumes
        # the index labels are unique - confirm against callers.
        train_data = data_frame[~data_frame.index.isin(test_data.index)]

        return train_data, test_data
    
    def concat(df_tr, df_tst):
        """Stack the train and test frames and build matching boolean masks.

        Returns
        -------
        tuple
            ``(df, (train_mask, test_mask))`` where ``df`` is ``df_tr``
            followed by ``df_tst`` and each mask is a boolean NumPy array with
            one entry per row of ``df``.
        """
        n_train = len(df_tr)
        n_total = n_train + len(df_tst)
        stacked = pd.concat([df_tr, df_tst])
        # The masks are positional (row order), presumably so that they do not
        # depend on the index labels of the stacked frame.
        in_train = np.arange(n_total) < n_train
        return stacked, (in_train, ~in_train)
        
    def evaluation(y, yhat):
        """Score predictions with five scikit-learn classification metrics.

        Parameters
        ----------
        y : array-like
            True labels.
        yhat : array-like
            Predicted values, passed unchanged to every metric (including
            ``roc_auc_score``).

        Returns
        -------
        pandas.DataFrame
            One row, one column per metric (named after the scikit-learn
            function), each value rounded to 6 decimals.
        """
        # Named ``scorers`` so the imported ``metrics`` module is not shadowed.
        scorers = (
            sklearn.metrics.accuracy_score,
            sklearn.metrics.precision_score,
            sklearn.metrics.recall_score,
            sklearn.metrics.f1_score,
            sklearn.metrics.roc_auc_score,
        )
        # Built-in round() works for NumPy scalars and plain Python floats
        # alike; the ``.round`` method only exists on the former.
        return pd.DataFrame({fn.__name__: [round(fn(y, yhat), 6)] for fn in scorers})
        
    def compute_time_difference(group):
        """List the absolute time gap for every ordered pair of rows in ``group``.

        Parameters
        ----------
        group : pandas.DataFrame
            Frame with a ``trans_date_trans_time`` column whose values support
            subtraction yielding an object with ``total_seconds()``.

        Returns
        -------
        list of list
            ``[label_i, label_j, seconds]`` for all ``n * n`` ordered pairs,
            including each row paired with itself. Labels are the index labels
            of ``group``.
        """
        # Pull labels and timestamps out once; the previous version rebuilt a
        # row Series with ``iloc`` several times per pair.
        labelled_times = list(
            zip(group.index.tolist(), group['trans_date_trans_time'].tolist())
        )
        return [
            [label_i, label_j, abs((time_i - time_j).total_seconds())]
            for label_i, time_i in labelled_times
            for label_j, time_j in labelled_times
        ]

    def edge_index_save(df, unique_col, theta, gamma):
        """Build a time-decay edge index and save the raw pair table to disk.

        Rows of ``df`` are grouped by ``unique_col``; within each group every
        ordered pair of rows gets the weight ``exp(-seconds / theta)``, where
        ``seconds`` is the absolute time gap from ``compute_time_difference``.
        Pairs whose weight equals exactly 1 (for example a zero time gap, which
        includes a row paired with itself) are given weight 0. Pairs with
        weight greater than ``gamma`` become edges.

        Before weighting, the ``(label_i, label_j, seconds)`` table is saved as
        ``edge_index_attempt{k}_{col}.npy`` in the working directory, using the
        first ``k`` that does not overwrite an existing file.

        Parameters
        ----------
        df : pandas.DataFrame
            Transactions; must provide what ``compute_time_difference`` needs.
        unique_col : str
            Column to group by.
        theta : float
            Time scale of the exponential decay, in seconds.
        gamma : float
            Weight threshold; only pairs with weight > ``gamma`` are kept.

        Returns
        -------
        torch.Tensor
            ``torch.long`` tensor of shape ``(2, num_edges)`` holding the index
            labels of each kept pair; ``(2, 0)`` when nothing is kept.
        """
        import os  # local import: the file's import block does not provide ``os``

        pair_rows = [
            row
            for _, group in df.groupby(unique_col)
            for row in compute_time_difference(group)
        ]
        # reshape keeps a (0, 3) table when there are no pairs, so the column
        # indexing below cannot fail.
        pair_table = np.array(pair_rows).astype(np.float64).reshape(-1, 3)

        col_tag = str(unique_col).replace(' ', '').replace('_', '')
        # The original read an undefined ``self.save_attempt``; a local counter
        # replaces it. NOTE(review): starting at 0 is an assumption.
        save_attempt = 0
        filename = f"edge_index_attempt{save_attempt}_{col_tag}.npy"
        while os.path.exists(filename):
            save_attempt += 1
            filename = f"edge_index_attempt{save_attempt}_{col_tag}.npy"
        np.save(filename, pair_table)

        decay = np.exp(-pair_table[:, 2] / theta)
        weights = np.where(decay != 1, decay, 0.0)
        kept_pairs = pair_table[weights > gamma, :2].astype(np.int64)
        return torch.tensor(kept_pairs, dtype=torch.long).t()
    
    def edge_index(df, unique_col, theta, gamma):
        """Build a time-decay edge index between rows sharing ``unique_col``.

        Rows of ``df`` are grouped by ``unique_col``; within each group every
        ordered pair of rows gets the weight ``exp(-seconds / theta)``, where
        ``seconds`` is the absolute time gap from ``compute_time_difference``.
        Pairs whose weight equals exactly 1 (for example a zero time gap, which
        includes a row paired with itself) are given weight 0. Pairs with
        weight greater than ``gamma`` become edges.

        Parameters
        ----------
        df : pandas.DataFrame
            Transactions; must provide what ``compute_time_difference`` needs.
        unique_col : str
            Column to group by.
        theta : float
            Time scale of the exponential decay, in seconds.
        gamma : float
            Weight threshold; only pairs with weight > ``gamma`` are kept.

        Returns
        -------
        torch.Tensor
            ``torch.long`` tensor of shape ``(2, num_edges)`` holding the index
            labels of each kept pair; ``(2, 0)`` when nothing is kept.
        """
        pair_rows = [
            row
            for _, group in df.groupby(unique_col)
            for row in compute_time_difference(group)
        ]
        # reshape keeps a (0, 3) table when there are no pairs, so the column
        # indexing below cannot fail.
        pair_table = np.array(pair_rows).astype(np.float64).reshape(-1, 3)

        decay = np.exp(-pair_table[:, 2] / theta)
        weights = np.where(decay != 1, decay, 0.0)
        kept_pairs = pair_table[weights > gamma, :2].astype(np.int64)
        return torch.tensor(kept_pairs, dtype=torch.long).t()

# Load the transactions, dropping the first CSV column.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
# Parse the timestamp column in one vectorized call rather than once per row.
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain

	trans_date_trans_time	cc_num	merchant	category	amt	first	last	gender	street	city	...	lat	long	city_pop	job	dob	trans_num	unix_time	merch_lat	merch_long	is_fraud
0	2019-01-01 00:00:00	2.703190e+15	fraud_Rippin, Kub and Mann	misc_net	4.97	Jennifer	Banks	F	561 Perry Cove	Moravian Falls	...	36.0788	-81.1781	3495	Psychologist, counselling	1988-03-09	0b242abb623afc578575680df30655b9	1325376018	36.011293	-82.048315	0
1	2019-01-01 00:00:00	6.304230e+11	fraud_Heller, Gutmann and Zieme	grocery_pos	107.23	Stephanie	Gill	F	43039 Riley Greens Suite 393	Orient	...	48.8878	-118.2105	149	Special educational needs teacher	1978-06-21	1f76529f8574734946361c461b024d99	1325376044	49.159047	-118.186462	0
2	2019-01-01 00:00:00	3.885950e+13	fraud_Lind-Buckridge	entertainment	220.11	Edward	Sanchez	M	594 White Dale Suite 530	Malad City	...	42.1808	-112.2620	4154	Nature conservation officer	1962-01-19	a1a22d70485983eac12b5b88dad1cf95	1325376051	43.150704	-112.154481	0
3	2019-01-01 00:01:00	3.534090e+15	fraud_Kutch, Hermiston and Farrell	gas_transport	45.00	Jeremy	White	M	9443 Cynthia Court Apt. 038	Boulder	...	46.2306	-112.1138	1939	Patent attorney	1967-01-12	6b849c168bdad6f867558c3793159a81	1325376076	47.034331	-112.561071	0
4	2019-01-01 00:03:00	3.755340e+14	fraud_Keeling-Crist	misc_pos	41.96	Tyler	Garcia	M	408 Bradley Rest	Doe Hill	...	38.4207	-79.4629	99	Dance movement psychotherapist	1986-03-28	a41d7549acf90789359a9aa5346dcb46	1325376186	38.674999	-78.632459	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1048570	2020-03-10 16:07:00	6.011980e+15	fraud_Fadel Inc	health_fitness	77.00	Haley	Wagner	F	05561 Farrell Crescent	Annapolis	...	39.0305	-76.5515	92106	Accountant, chartered certified	1943-05-28	45ecd198c65e81e597db22e8d2ef7361	1362931649	38.779464	-76.317042	0
1048571	2020-03-10 16:07:00	4.839040e+15	fraud_Cremin, Hamill and Reichel	misc_pos	116.94	Meredith	Campbell	F	043 Hanson Turnpike	Hedrick	...	41.1826	-92.3097	1583	Geochemist	1999-06-28	c00ce51c6ebb7657474a77b9e0b51f34	1362931670	41.400318	-92.726724	0
1048572	2020-03-10 16:08:00	5.718440e+11	fraud_O'Connell, Botsford and Hand	home	21.27	Susan	Mills	F	005 Cody Estates	Louisville	...	38.2507	-85.7476	736284	Engineering geologist	1952-04-02	17c9dc8b2a6449ca2473726346e58e6c	1362931711	37.293339	-84.798122	0
1048573	2020-03-10 16:08:00	4.646850e+18	fraud_Thompson-Gleason	health_fitness	9.52	Julia	Bell	F	576 House Crossroad	West Sayville	...	40.7320	-73.1000	4056	Film/video editor	1990-06-25	5ca650881b48a6a38754f841c23b77ab	1362931718	39.773077	-72.213209	0
1048574	2020-03-10 16:08:00	2.283740e+15	fraud_Buckridge PLC	misc_pos	6.81	Shannon	Williams	F	9345 Spencer Junctions Suite 183	Alpharetta	...	34.0770	-84.3033	165556	Prison officer	1997-12-27	8d0a575fe635bbde12f1a2bffc126731	1362931730	33.601468	-83.891921	0

1048575 rows × 22 columns

# Downsample normal transactions so fraud makes up 20% of the working set.
df = throw(fraudTrain, 0.2)

# Random train/test split with a fixed seed; no test size is passed, so
# scikit-learn's default applies.
df_tr,df_tst = sklearn.model_selection.train_test_split(df, random_state=42)

# df_tr, df_tst = split_dataframe(df, 0.4)

# df2, mask = concat(df_tr, df_tst)

# def compute_time_difference2(group):
#     n = len(group)
#     result = []
#     for i in range(n):
#         for j in range(n):
#             time_difference = abs((group.iloc[i].trans_date_trans_time - group.iloc[j].trans_date_trans_time).total_seconds())
#             result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
#     return result
# def edge_index2(df, unique_col, theta, gamma, hms='s'):
#     groups = df.groupby(unique_col)
#     edge_index = np.array([item for sublist in (compute_time_difference2(group) for _, group in groups) for item in sublist])
#     edge_index = edge_index.astype(np.float64)
#     filename = f"edge_index{str(unique_col).replace(' ', '').replace('_', '')}.npy"  # 저장
#     np.save(filename, edge_index)
#     edge_index[:,2] = (np.exp(-edge_index[:,2]/(theta)) != 1)*(np.exp(-edge_index[:,2]/(theta))).tolist()
#     edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma], dtype=torch.long).t()
#     return edge_index

# edge_index2(df,'cc_num', 8.028000e+04, 0.3) # 시도 1

AutoGluon

# Wrap the train and test frames as AutoGluon tabular datasets.
tr = TabularDataset(df_tr)
tst = TabularDataset(df_tst)

# Predictor with "is_fraud" as the label column.
predictr = TabularPredictor("is_fraud")

No path specified. Models will be saved in: "AutogluonModels/ag-20240123_061134/"

# Fit on the training set with AutoGluon's default settings.
predictr.fit(tr)

Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240123_061134/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   625.58 GB / 982.82 GB (63.7%)
Train Data Rows:    22522
Train Data Columns: 21
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [0, 1]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    31748.69 MB
    Train Data (Original)  Memory Usage: 19.12 MB (0.1% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
            Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
        Fitting CategoryFeatureGenerator...
            Fitting CategoryMemoryMinimizeFeatureGenerator...
        Fitting DatetimeFeatureGenerator...
        Fitting TextSpecialFeatureGenerator...
            Fitting BinnedFeatureGenerator...
            Fitting DropDuplicatesFeatureGenerator...
        Fitting TextNgramFeatureGenerator...
            Fitting CountVectorizer for text features: ['street']
            CountVectorizer fit with vocabulary size = 2
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Unused Original Features (Count: 1): ['trans_num']
        These features were not used to generate any of the output features. Add a feature generator compatible with these features to utilize them.
        Features can also be unused if they carry very little information, such as being categorical but having almost entirely unique values or being duplicates of other features.
        These features do not need to be present at inference time.
        ('object', []) : 1 | ['trans_num']
    Types of features in original data (raw dtype, special dtypes):
        ('datetime', [])                   : 1 | ['trans_date_trans_time']
        ('float', [])                      : 6 | ['cc_num', 'amt', 'lat', 'long', 'merch_lat', ...]
        ('int', [])                        : 3 | ['zip', 'city_pop', 'unix_time']
        ('object', [])                     : 8 | ['merchant', 'category', 'first', 'last', 'gender', ...]
        ('object', ['datetime_as_object']) : 1 | ['dob']
        ('object', ['text'])               : 1 | ['street']
    Types of features in processed data (raw dtype, special dtypes):
        ('category', [])                    :  7 | ['merchant', 'category', 'first', 'last', 'city', ...]
        ('category', ['text_as_category'])  :  1 | ['street']
        ('float', [])                       :  6 | ['cc_num', 'amt', 'lat', 'long', 'merch_lat', ...]
        ('int', [])                         :  3 | ['zip', 'city_pop', 'unix_time']
        ('int', ['binned', 'text_special']) :  8 | ['street.char_count', 'street.word_count', 'street.capital_ratio', 'street.lower_ratio', 'street.digit_ratio', ...]
        ('int', ['bool'])                   :  1 | ['gender']
        ('int', ['datetime_as_int'])        : 10 | ['trans_date_trans_time', 'trans_date_trans_time.year', 'trans_date_trans_time.month', 'trans_date_trans_time.day', 'trans_date_trans_time.dayofweek', ...]
        ('int', ['text_ngram'])             :  1 | ['__nlp__.suite']
    1.0s = Fit runtime
    20 features in original data used to generate 37 features in processed data.
    Train Data (Processed) Memory Usage: 3.99 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 1.02s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 20269, Val Rows: 2253
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f58e3f2b700>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
    0.9188   = Validation score   (accuracy)
    0.01s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: KNeighborsDist ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f59cfd29790>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
    0.9436   = Validation score   (accuracy)
    0.01s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: LightGBMXT ...
    0.9796   = Validation score   (accuracy)
    1.47s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: LightGBM ...
    0.9822   = Validation score   (accuracy)
    2.52s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: RandomForestGini ...
    0.9698   = Validation score   (accuracy)
    1.01s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.9685   = Validation score   (accuracy)
    1.11s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: CatBoost ...
    0.992    = Validation score   (accuracy)
    5.36s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: ExtraTreesGini ...
    0.9711   = Validation score   (accuracy)
    0.49s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.9729   = Validation score   (accuracy)
    0.49s    = Training   runtime
    0.05s    = Validation runtime
Fitting model: NeuralNetFastAI ...
    0.976    = Validation score   (accuracy)
    18.5s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: XGBoost ...
    0.9831   = Validation score   (accuracy)
    3.18s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: NeuralNetTorch ...
    0.9569   = Validation score   (accuracy)
    17.3s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.9765   = Validation score   (accuracy)
    1.35s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.9969   = Validation score   (accuracy)
    0.76s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 55.57s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240123_061134/")

[1000]  valid_set's binary_error: 0.0208611
[1000]  valid_set's binary_error: 0.0217488
[2000]  valid_set's binary_error: 0.0186418

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f59153bdf10>

# Show the per-model leaderboard of the fitted predictor.
predictr.leaderboard()

                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2   0.996893       0.119584   9.796088                0.002782           0.762813            2       True         14
1              CatBoost   0.992011       0.013630   5.359717                0.013630           5.359717            1       True          7
2               XGBoost   0.983134       0.017992   3.177904                0.017992           3.177904            1       True         11
3              LightGBM   0.982246       0.036582   2.519265                0.036582           2.519265            1       True          4
4            LightGBMXT   0.979583       0.028434   1.471005                0.028434           1.471005            1       True          3
5         LightGBMLarge   0.976476       0.015682   1.346799                0.015682           1.346799            1       True         13
6       NeuralNetFastAI   0.976032       0.029451  18.500963                0.029451          18.500963            1       True         10
7        ExtraTreesEntr   0.972925       0.046994   0.492940                0.046994           0.492940            1       True          9
8        ExtraTreesGini   0.971150       0.044906   0.486165                0.044906           0.486165            1       True          8
9      RandomForestGini   0.969818       0.043695   1.013964                0.043695           1.013964            1       True          5
10     RandomForestEntr   0.968486       0.042892   1.113244                0.042892           1.113244            1       True          6
11       NeuralNetTorch   0.956946       0.023545  17.300473                0.023545          17.300473            1       True         12
12       KNeighborsDist   0.943631       0.040274   0.009490                0.040274           0.009490            1       True          2
13       KNeighborsUnif   0.918775       0.032133   0.008666                0.032133           0.008666            1       True          1

	model	score_val	pred_time_val	fit_time	pred_time_val_marginal	fit_time_marginal	stack_level	can_infer	fit_order
0	WeightedEnsemble_L2	0.996893	0.119584	9.796088	0.002782	0.762813	2	True	14
1	CatBoost	0.992011	0.013630	5.359717	0.013630	5.359717	1	True	7
2	XGBoost	0.983134	0.017992	3.177904	0.017992	3.177904	1	True	11
3	LightGBM	0.982246	0.036582	2.519265	0.036582	2.519265	1	True	4
4	LightGBMXT	0.979583	0.028434	1.471005	0.028434	1.471005	1	True	3
5	LightGBMLarge	0.976476	0.015682	1.346799	0.015682	1.346799	1	True	13
6	NeuralNetFastAI	0.976032	0.029451	18.500963	0.029451	18.500963	1	True	10
7	ExtraTreesEntr	0.972925	0.046994	0.492940	0.046994	0.492940	1	True	9
8	ExtraTreesGini	0.971150	0.044906	0.486165	0.044906	0.486165	1	True	8
9	RandomForestGini	0.969818	0.043695	1.013964	0.043695	1.013964	1	True	5
10	RandomForestEntr	0.968486	0.042892	1.113244	0.042892	1.113244	1	True	6
11	NeuralNetTorch	0.956946	0.023545	17.300473	0.023545	17.300473	1	True	12
12	KNeighborsDist	0.943631	0.040274	0.009490	0.040274	0.009490	1	True	2
13	KNeighborsUnif	0.918775	0.032133	0.008666	0.032133	0.008666	1	True	1

# Accuracy on the training set: share of rows whose prediction matches the label.
(tr.is_fraud == predictr.predict(tr)).mean()

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f5910db6e50>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'

0.9996891927892727

# Accuracy on the held-out test set.
(tst.is_fraud == predictr.predict(tst)).mean()

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f58916edca0>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'

0.9949387320191796

# Predictions for the test set, used for the metric table below.
yhat = predictr.predict(tst)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f59cfd29280>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'

# True labels of the test set.
y = df_tst.is_fraud

# One-row table of classification metrics for the test predictions.
evaluation(y,yhat)

	accuracy_score	precision_score	recall_score	f1_score	roc_auc_score
0	0.994939	0.991892	0.982597	0.987223	0.990301

    def evaluation(y, yhat):
        """Score predictions with five scikit-learn classification metrics.

        Parameters
        ----------
        y : array-like
            True labels.
        yhat : array-like
            Predicted values, passed unchanged to every metric (including
            ``roc_auc_score``).

        Returns
        -------
        pandas.DataFrame
            One row, one column per metric (named after the scikit-learn
            function), each value rounded to 6 decimals.
        """
        # Named ``scorers`` so the imported ``metrics`` module is not shadowed.
        scorers = (
            sklearn.metrics.accuracy_score,
            sklearn.metrics.precision_score,
            sklearn.metrics.recall_score,
            sklearn.metrics.f1_score,
            sklearn.metrics.roc_auc_score,
        )
        # Built-in round() works for NumPy scalars and plain Python floats
        # alike; the ``.round`` method only exists on the former.
        return pd.DataFrame({fn.__name__: [round(fn(y, yhat), 6)] for fn in scorers})